library(here)
library(tidyverse)
library(conflicted)
# library(easystats)
# Load the exoplanet catalogue from the project's data/ directory.
exoplanets <- here("data", "exoplanet_catalog_080325.csv") %>%
  read_csv()
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 7418 Columns: 98── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (12): name, planet_status, publication, detection_type, mass_measurement_type, radius_measurement_type, alternate_names, molecules, star_name, star_sp_ty...
dbl (83): mass, mass_error_min, mass_error_max, mass_sini, mass_sini_error_min, mass_sini_error_max, radius, radius_error_min, radius_error_max, orbital_peri...
lgl (2): hot_point_lon, star_magnetic_field
date (1): updated
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble for a first look at the raw catalogue.
exoplanets
library(skimr)
# Per-column summary of the whole table (output is grouped by column type).
skim(exoplanets)
Warning: There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names), mangled_skimmers$funs)`.
ℹ In group 0: .
Caused by warning:
! There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names), mangled_skimmers$funs)`.
Caused by warning in `inline_hist()`:
! Variable contains Inf or -Inf value(s) that were converted to NA.
── Data Summary ────────────────────────
Values
Name exoplanets
Number of rows 7418
Number of columns 98
_______________________
Column type frequency:
character 12
Date 1
logical 2
numeric 83
________________________
Group variables None
library(naniar)
# Plot the amount of missing data per variable.
exoplanets %>% gg_miss_var()
library(visdat)
# Plot column classes and missing cells for the whole table.
exoplanets %>% vis_dat()
# List every column name.
colnames(exoplanets)
[1] "name" "planet_status" "mass" "mass_error_min" "mass_error_max"
[6] "mass_sini" "mass_sini_error_min" "mass_sini_error_max" "radius" "radius_error_min"
[11] "radius_error_max" "orbital_period" "orbital_period_error_min" "orbital_period_error_max" "semi_major_axis"
[16] "semi_major_axis_error_min" "semi_major_axis_error_max" "eccentricity" "eccentricity_error_min" "eccentricity_error_max"
[21] "inclination" "inclination_error_min" "inclination_error_max" "angular_distance" "discovered"
[26] "updated" "omega" "omega_error_min" "omega_error_max" "tperi"
[31] "tperi_error_min" "tperi_error_max" "tconj" "tconj_error_min" "tconj_error_max"
[36] "tzero_tr" "tzero_tr_error_min" "tzero_tr_error_max" "tzero_tr_sec" "tzero_tr_sec_error_min"
[41] "tzero_tr_sec_error_max" "lambda_angle" "lambda_angle_error_min" "lambda_angle_error_max" "impact_parameter"
[46] "impact_parameter_error_min" "impact_parameter_error_max" "tzero_vr" "tzero_vr_error_min" "tzero_vr_error_max"
[51] "k" "k_error_min" "k_error_max" "temp_calculated" "temp_calculated_error_min"
[56] "temp_calculated_error_max" "temp_measured" "hot_point_lon" "geometric_albedo" "geometric_albedo_error_min"
[61] "geometric_albedo_error_max" "log_g" "publication" "detection_type" "mass_measurement_type"
[66] "radius_measurement_type" "alternate_names" "molecules" "star_name" "ra"
[71] "dec" "mag_v" "mag_i" "mag_j" "mag_h"
[76] "mag_k" "star_distance" "star_distance_error_min" "star_distance_error_max" "star_metallicity"
[81] "star_metallicity_error_min" "star_metallicity_error_max" "star_mass" "star_mass_error_min" "star_mass_error_max"
[86] "star_radius" "star_radius_error_min" "star_radius_error_max" "star_sp_type" "star_age"
[91] "star_age_error_min" "star_age_error_max" "star_teff" "star_teff_error_min" "star_teff_error_max"
[96] "star_detected_disc" "star_magnetic_field" "star_alternate_names"
library(janitor)
# Frequency table of planet_status.
tabyl(exoplanets, planet_status)
planet_status n percent
Confirmed 7418 1
library(data.table)
# options(repr.matrix.max.rows=100)
# Preview the five rows with the smallest share of missing values, transposed
# so that each original column becomes a row (easier to read for a wide table).
# Assigned with `<-` up front so the binding is visible at the start of the
# pipeline instead of being hidden at its end.
preview <- exoplanets %>%
  add_prop_miss() %>%          # adds `prop_miss_all`: per-row share of missing values
  arrange(prop_miss_all) %>%   # most complete rows first
  head(5) %>%
  # namespaced on purpose: `transpose` also exists in purrr (loaded by tidyverse)
  data.table::transpose(keep.names = "column")
preview
# preview %>% View()
We have a lot of features: - Planet name - Mass (M jup) - Mass*sin(i) (M jup) - This is the minimum mass of the planet, because the orbital inclination i is unknown - Radius (R jup) - Period (days) - a / semi-major axis: the average distance between the planet and its star - it’s in AU (astronomical units), the standard distance unit at this scale - 1 AU is the average distance between the Earth and the Sun - e / eccentricity of the planet's orbit (between 0 and 1 for a bound orbit) - represents how far the orbit deviates from a circle - e = 0 means a perfect circle, e ≥ 1 means it's not bound to the star - Discovery - year when it was discovered - Update - date the entry was last updated
# Resolve the `filter()` name clash in favour of dplyr from here on.
conflicts_prefer(dplyr::filter)
[conflicted] Removing existing preference.[conflicted] Will prefer dplyr::filter over any other package.
# Look up planets whose name contains "TOI-784" (SQL LIKE-style pattern).
filter(exoplanets, str_like(name, "%TOI-784%"))
# NOTE(review): this repeats the identical conflicts_prefer() call made earlier
# in this file; presumably kept so the chunk can be run on its own.
conflicts_prefer(dplyr::filter)
[conflicted] Removing existing preference.[conflicted] Will prefer dplyr::filter over any other package.
# Planets whose discovery year is 2023.
filter(exoplanets, discovered == 2023)
# All-sky map of planet positions in an Aitoff projection.
# RA and Dec are plotted as-is: the previous `ra_rad`/`dec_rad` columns were
# plain copies (no radian conversion was ever applied) and only mislabelled the
# axes. coord_map() works on longitude/latitude in degrees.
# Assumes the catalogue stores ra/dec in degrees — TODO confirm.
exoplanets %>%
  ggplot(aes(x = ra, y = dec, color = dec)) +
  geom_point(size = 0.4) +
  coord_map("aitoff") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none" # colour only duplicates the y position
  )
# Inspect the columns whose names start with "star".
select(exoplanets, starts_with("star"))
library(dplyr)
library(plotly)
# Place every planet on the unit sphere and attach a category label plus
# hover text for the interactive plot.
exoplanets_3d <- exoplanets %>%
  mutate(
    # Degrees -> radians.
    ra_rad = ra * pi / 180,
    dec_rad = dec * pi / 180,
    # Spherical -> Cartesian coordinates on a sphere of radius 1.
    x = cos(dec_rad) * cos(ra_rad),
    y = cos(dec_rad) * sin(ra_rad),
    z = sin(dec_rad),
    # Category label (not a colour): a "kepler"/"koi" match in the name or the
    # alternate names wins, then planets without a host star name, then the rest.
    color = case_when(
      str_detect(
        paste(name, alternate_names),
        regex("kepler|koi", ignore_case = TRUE)
      ) ~ "Kepler",
      is.na(star_name) ~ "Free Floating",
      TRUE ~ "Other"
    ),
    # Text shown when hovering over a point.
    hover_text = paste("Name: ", name)
  )
# Opacity-slider steps: one "restyle" step per opacity value from 0.0 to 1.0
# in increments of 0.1, each labelled with its value.
steps <- lapply(0:10, function(tenths) {
  opacity <- tenths / 10
  list(
    args = list("marker.opacity", opacity),
    label = sprintf("%.1f", opacity),
    method = "restyle"
  )
})
# Interactive 3D scatter of the planets on the unit sphere, coloured by the
# category label in `color`, with a slider that restyles the marker opacity.
plot_ly(
  data = exoplanets_3d,
  x = ~x,
  y = ~y,
  z = ~z,
  color = ~color, # category label computed in exoplanets_3d
  colors = c("Other" = "red", "Kepler" = "blue", "Free Floating" = "green"),
  text = ~hover_text, # planet name on hover
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 1, opacity = 0.7), # initial opacity
  showlegend = TRUE
) %>%
  # Namespaced so the call cannot be mistaken for graphics::layout().
  plotly::layout(
    title = "3D Sky Map of Exoplanets (Kepler Highlighted)",
    scene = list(
      xaxis = list(title = "X"),
      yaxis = list(title = "Y"),
      zaxis = list(title = "Z")
    ),
    sliders = list(
      list(
        # `active` is the zero-based index of the step shown as selected.
        # Index 7 is the "0.7" step, matching the initial marker opacity above
        # (the previous value 1 pointed the slider at "0.1").
        active = 7,
        currentvalue = list(
          prefix = "Opacity: ",
          font = list(size = 15)
        ),
        pad = list(t = 60),
        steps = steps # opacity steps defined earlier in the file
      )
    )
  )
Warning: Ignoring 1 observationsWarning: Ignoring 1 observations
# Polar view of the sample: angle = right ascension, radius = host-star
# distance (log scale), colour = planet mass (log scale).
# RA is used directly as the angle; the earlier `ra_deg`/`theta` columns were
# plain copies of `ra`. Assumes `ra` is in degrees (0-360) — TODO confirm.
exoplanets %>%
  ggplot(aes(x = ra, y = star_distance, color = mass)) +
  # direction = -1 makes the angle increase anticlockwise, starting at the top.
  coord_polar(start = 0, direction = -1) +
  # Concentric reference circles at each decade of distance.
  # `linewidth` replaces the deprecated `size` aesthetic for lines.
  geom_hline(
    yintercept = c(10, 100, 1000, 10000),
    color = "gray", linetype = "solid", linewidth = 0.3, alpha = 0.7
  ) +
  # Radial reference lines every 30 degrees.
  geom_vline(
    xintercept = seq(0, 330, by = 30),
    color = "gray", linetype = "solid", linewidth = 0.3, alpha = 0.7
  ) +
  geom_point(alpha = 0.8, size = 1) +
  # Log-scaled radius; points outside 1-15000 are dropped by the limits.
  scale_y_log10(
    breaks = c(10, 100, 1000, 10000),
    labels = c("10 pc", "100 pc", "1000 pc", "10000 pc"),
    limits = c(1, 15000)
  ) +
  # Log-scaled colour ramp for planet mass.
  scale_color_gradientn(
    colors = c("#1E90FF", "#32CD32", "#FFFF00", "#FFA500", "#FF4500", "#FF0000"),
    trans = "log10",
    breaks = c(0.0001, 0.001, 0.01, 0.1, 1, 10),
    labels = c("10⁻⁴", "10⁻³", "10⁻²", "10⁻¹", "10⁰", "10¹"),
    name = "Planetary Mass (MJup)"
  ) +
  # Strip the default axes and grid; the reference lines above replace them.
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_blank(),
    panel.grid = element_blank(),
    legend.position = "bottom",
    legend.box = "horizontal",
    plot.title = element_text(hjust = 0.5)
  ) +
  ggtitle("Exoplanet Distribution")
library(dplyr)
library(plotly)
# Place every planet in 3D: direction from RA/Dec, radius from the host-star
# distance, plus a category label and hover text for the interactive plot.
exoplanets_3d <- exoplanets %>%
  mutate(
    # Degrees -> radians.
    ra_rad = ra * pi / 180,
    dec_rad = dec * pi / 180,
    # Unit direction vector on the sky.
    x = cos(dec_rad) * cos(ra_rad),
    y = cos(dec_rad) * sin(ra_rad),
    z = sin(dec_rad),
    # Category label (not a colour): a "kepler"/"koi" match in the name or the
    # alternate names wins, then planets without a host star name, then the rest.
    color = case_when(
      str_detect(
        paste(name, alternate_names),
        regex("kepler|koi", ignore_case = TRUE)
      ) ~ "Kepler",
      is.na(star_name) ~ "Free Floating",
      TRUE ~ "Other"
    ),
    # Text shown when hovering over a point.
    hover_text = paste("Name: ", name),
    # Scale the unit vector by the distance so that nearer systems sit nearer
    # the origin. The previous `1 / star_distance` factor did the opposite,
    # pushing the closest systems farthest out.
    # Rows with a missing star_distance yield NA coordinates.
    scaled_x = x * star_distance,
    scaled_y = y * star_distance,
    scaled_z = z * star_distance
  )
# Slider steps for marker opacity, 0.0 through 1.0 in tenths. Every step
# issues a "restyle" of "marker.opacity" and carries its value as the label.
make_opacity_step <- function(opacity) {
  list(
    args = list("marker.opacity", opacity),
    label = sprintf("%.1f", opacity),
    method = "restyle"
  )
}
steps <- lapply((0:10) / 10, make_opacity_step)
# Interactive 3D scatter of the distance-scaled positions, coloured by the
# category label in `color`.
fig <- plot_ly(
  data = exoplanets_3d,
  x = ~scaled_x,
  y = ~scaled_y,
  z = ~scaled_z,
  color = ~color, # category label computed in exoplanets_3d
  colors = c("Other" = "red", "Kepler" = "blue", "Free Floating" = "green"),
  text = ~hover_text, # planet name on hover
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 2, opacity = 0.7), # initial opacity
  showlegend = TRUE
)
# Add titles and a slider that restyles the marker opacity.
# `layout` is namespaced so the call cannot be mistaken for graphics::layout().
fig <- fig %>% plotly::layout(
  title = "3D Sky Map of Exoplanets (Kepler Highlighted)",
  scene = list(
    xaxis = list(title = "X"),
    yaxis = list(title = "Y"),
    zaxis = list(title = "Z")
  ),
  sliders = list(
    list(
      # `active` is the zero-based index of the step shown as selected.
      # Index 7 is the "0.7" step, matching the initial marker opacity above
      # (the previous value 1 pointed the slider at "0.1").
      active = 7,
      currentvalue = list(
        prefix = "Opacity: ",
        font = list(size = 15)
      ),
      pad = list(t = 60),
      steps = steps # opacity steps defined earlier in the file
    )
  )
)
fig
Warning: Ignoring 357 observationsWarning: Ignoring 357 observations
# List every column name again for reference.
names(exoplanets)
[1] "name" "planet_status" "mass" "mass_error_min" "mass_error_max"
[6] "mass_sini" "mass_sini_error_min" "mass_sini_error_max" "radius" "radius_error_min"
[11] "radius_error_max" "orbital_period" "orbital_period_error_min" "orbital_period_error_max" "semi_major_axis"
[16] "semi_major_axis_error_min" "semi_major_axis_error_max" "eccentricity" "eccentricity_error_min" "eccentricity_error_max"
[21] "inclination" "inclination_error_min" "inclination_error_max" "angular_distance" "discovered"
[26] "updated" "omega" "omega_error_min" "omega_error_max" "tperi"
[31] "tperi_error_min" "tperi_error_max" "tconj" "tconj_error_min" "tconj_error_max"
[36] "tzero_tr" "tzero_tr_error_min" "tzero_tr_error_max" "tzero_tr_sec" "tzero_tr_sec_error_min"
[41] "tzero_tr_sec_error_max" "lambda_angle" "lambda_angle_error_min" "lambda_angle_error_max" "impact_parameter"
[46] "impact_parameter_error_min" "impact_parameter_error_max" "tzero_vr" "tzero_vr_error_min" "tzero_vr_error_max"
[51] "k" "k_error_min" "k_error_max" "temp_calculated" "temp_calculated_error_min"
[56] "temp_calculated_error_max" "temp_measured" "hot_point_lon" "geometric_albedo" "geometric_albedo_error_min"
[61] "geometric_albedo_error_max" "log_g" "publication" "detection_type" "mass_measurement_type"
[66] "radius_measurement_type" "alternate_names" "molecules" "star_name" "ra"
[71] "dec" "mag_v" "mag_i" "mag_j" "mag_h"
[76] "mag_k" "star_distance" "star_distance_error_min" "star_distance_error_max" "star_metallicity"
[81] "star_metallicity_error_min" "star_metallicity_error_max" "star_mass" "star_mass_error_min" "star_mass_error_max"
[86] "star_radius" "star_radius_error_min" "star_radius_error_max" "star_sp_type" "star_age"
[91] "star_age_error_min" "star_age_error_max" "star_teff" "star_teff_error_min" "star_teff_error_max"
[96] "star_detected_disc" "star_magnetic_field" "star_alternate_names"
# Share of rows with a missing value in each sky-position column.
# Note: `percentage` holds a fraction between 0 and 1, not a 0-100 value; the
# output column names are kept as they were.
# across() and pivot_longer() replace the superseded summarise_all() and gather().
exoplanets %>%
  summarise(across(c(ra, dec, angular_distance), ~ mean(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "percentage")
# Show the rows that have no right ascension.
filter(exoplanets, is.na(ra))
# Planets with a non-empty alternate_names entry (rows where it is NA are dropped).
exoplanets %>%
  filter(str_length(alternate_names) > 0) %>%
  select(name, alternate_names)
NA
# Frequency table of the publication status.
tabyl(exoplanets, publication)
publication n percent
Announced on a professional conference 55 0.007414397
Announced on a website 2357 0.317740631
Published in a refereed paper 4873 0.656915611
Submitted to a professional journal 133 0.017929361
# Reduced working copy of the catalogue with the unwanted columns dropped.
exoplanets_r <- exoplanets %>%
  select(
    -contains("error"), # every column with "error" in its name
    -c(planet_status, updated, alternate_names, publication), # useless
    -hot_point_lon # too many missings
  )
names(exoplanets_r)
library(visdat)
# Column classes and missing cells of the reduced table.
vis_dat(exoplanets_r)
# Missingness map with columns sorted by missingness and rows clustered.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
vis_miss(exoplanets_r, sort_miss = TRUE, cluster = TRUE)
# Detection methods, most frequent first.
exoplanets %>%
  tabyl(detection_type) %>%
  arrange(desc(n))
library(fastDummies)
# One logical indicator column per detection method. `detection_type` can list
# several methods separated by ", ", so the value is split before dummy-coding.
exoplanets_rd <- exoplanets_r %>%
  dummy_cols(select_columns = "detection_type", split = ", ") %>%
  # across() replaces the superseded mutate_at(); 0/1 dummies become FALSE/TRUE.
  mutate(across(starts_with("detection_type_"), as.logical))
# Distinct combinations of the original value and its indicator columns.
exoplanets_rd %>%
  select(starts_with("detection_type")) %>%
  distinct()
library(naniar)
# Compare per-variable missingness between planets that were and were not
# detected by primary transit.
exoplanets_rd %>%
  group_by(`detection_type_Primary Transit`) %>%
  miss_var_summary() %>%
  ungroup() %>% # grouping is only needed for the summary itself
  arrange(variable) %>%
  # Drop the detection_type columns themselves. TRUE is spelled out because
  # `T` is an ordinary variable that can be reassigned.
  filter(str_detect(variable, "detection_type", negate = TRUE)) %>%
  ggplot(aes(x = variable, y = pct_miss, fill = `detection_type_Primary Transit`)) +
  geom_col(position = "dodge") +
  coord_flip()
# Detection methods among planets whose name contains "Kepler".
exoplanets %>%
  filter(str_like(name, "%Kepler%")) %>%
  tabyl(detection_type)
# Inspect the planets whose detection method is recorded as "Other".
filter(exoplanets, detection_type == "Other")